Find SNPs in all MR genes

library(tidyverse)

get list of genes

genes <- read_csv("../output/MR_UN_graphs_node_annotation_2012.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  .id = col_character(),
  trt = col_character(),
  name = col_character(),
  chrom = col_character(),
  subject = col_character(),
  AGI = col_character(),
  At_symbol = col_character(),
  At_description = col_character()
)
See spec(...) for full column specifications.
genes

output BED file for use by samtools

genes %>%
  select(chrom, start, end, name) %>%
  mutate(start = start - 1) %>% # first base is base 0 in BED system.
  # but end base is not included, so leave that alone
  write_tsv("../output/MR_genes.bed", col_names = FALSE)

On whitney… Only use the UN files to keep things faster

for f in $(ls bamsnew/mapping_v1.5_merged_bam_files/*UN*bam)
do
  f=$(basename $f)
  echo $f
  samtools view -b bamsnew/mapping_v1.5_merged_bam_files/$f -L MR_genes.bed > subsetbams/$f
  samtools index subsetbams/$f
done

now run freebayes in parallel. First split up the bed file into chunks

split MR_genes.bed -n "l/12" MR_gene.bedx
parallel -j 12 freebayes --targets {} -f ~/Sequences/ref_genomes/B_rapa/genome/V1.5/Brapa_sequence_v1.5.fa subsetbams/*.bam  ::: *.bedx* > MRgenes.vcf

snpeff

java -jar snpEff/snpEff.jar  Brassica_rapa MRgenes.vcf > MRgenes_ann.vcf

now bring to R and process

vcf.header <- system("grep '#C' ../output/MRGenes_ann.vcf",intern = TRUE)  %>% 
  str_replace("#","")  %>% str_split(pattern = "\t") %>% 
  magrittr::extract2(1)
vcf.header
 [1] "CHROM"   "POS"     "ID"      "REF"     "ALT"    
 [6] "QUAL"    "FILTER"  "INFO"    "FORMAT"  "unknown"
vcf.data <- read_tsv("../output/MRGenes_ann.vcf",na = c("","NA","."),comment="#",col_names = vcf.header) %>%
  select(-FILTER) %>%
  mutate(ID=str_c(CHROM,"_",POS))
Parsed with column specification:
cols(
  CHROM = col_character(),
  POS = col_double(),
  ID = col_logical(),
  REF = col_character(),
  ALT = col_character(),
  QUAL = col_double(),
  FILTER = col_logical(),
  INFO = col_character(),
  FORMAT = col_character(),
  unknown = col_character()
)
vcf.data

split the genotype record:

vcf.data <- vcf.data %>%
  separate(unknown, into=c("GT", "DP", "AD", "RO", "QR", "AO", "QA", "GL"), convert = TRUE, sep=":")
vcf.data

Filter. Because all of the RILs are considered together, we want to keep SNPs that are heterozygous (meaning segregating)

vcf.data <- vcf.data %>% filter(!(GT %in% c("0/0", "1/1")))
vcf.data

filter by depth. We had a large number of samples go into this (like 400). Filter to require a depth of at least 400

vcf.data <- vcf.data %>% filter(DP > 400)
vcf.data

scanning through this, qualities and depth all look good so will not do further filtering.

now pull out the snp info…

first, pull out the full annotation ino

vcf.data <- vcf.data %>%
  mutate(ANN=str_extract(INFO,"ANN.*$"),
         ANN=str_remove(ANN, "ANN="))
head(vcf.data$ANN)
[1] "G|missense_variant|MODERATE|Bra037847|Bra037847|transcript|Bra037847|protein_coding|1/2|c.83C>G|p.Thr28Ser|83/711|83/711|28/236||,G|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4418C>G|||||4418|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"                                                                                                                                                                                              
[2] "G|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.234A>G|p.Pro78Pro|234/711|234/711|78/236||,G|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4663A>G|||||4663|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"                                                                                                                                                                                              
[3] "A|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.249C>A|p.Ile83Ile|249/711|249/711|83/236||,A|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4678C>A|||||4678|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"                                                                                                                                                                                              
[4] "T|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.256C>T|p.Leu86Leu|256/711|256/711|86/236||,T|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4685C>T|||||4685|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"                                                                                                                                                                                              
[5] "A|missense_variant|MODERATE|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.346C>A|p.His116Asn|346/711|346/711|116/236||,A|upstream_gene_variant|MODIFIER|Bra037848|Bra037848|transcript|Bra037848|protein_coding||c.-4966C>A|||||4966|,A|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4775C>A|||||4775|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"                                                                          
[6] "T|synonymous_variant|LOW|Bra008566|Bra008566|transcript|Bra008566|protein_coding|20/20|c.2634T>A|p.Leu878Leu|2634/2691|2634/2691|878/896||,T|upstream_gene_variant|MODIFIER|Bra008565|Bra008565|transcript|Bra008565|protein_coding||c.-265T>A|||||265|,T|downstream_gene_variant|MODIFIER|Bra008563|Bra008563|transcript|Bra008563|protein_coding||c.*3844A>T|||||3844|,T|downstream_gene_variant|MODIFIER|Bra008564|Bra008564|transcript|Bra008564|protein_coding||c.*881A>T|||||881|"

the tricky part is that each SNP may have a different number of annotations, so I need to create a list-column

vcf.data.filter <- vcf.data %>% 
  filter(Annotation_Impact != "LOW",
         Annotation_Impact != "MODIFIER") %>%
  arrange(CHROM, POS)
vcf.data.filter
vcf.data.filter <- vcf.data.filter %>% 
  mutate(IGV=str_c(CHROM, ":", POS, "-", POS)) %>%
  select(CHROM, POS, ID, IGV, everything())
vcf.data.filter
write_csv(vcf.data.filter, "MR_gene_Annotated_Filtered_SNPs.csv")
LS0tCnRpdGxlOiAiTVIgU05QcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKRmluZCBTTlBzIGluIGFsbCBNUiBnZW5lcwoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCmdldCBsaXN0IG9mIGdlbmVzCmBgYHtyfQpnZW5lcyA8LSByZWFkX2NzdigiLi4vb3V0cHV0L01SX1VOX2dyYXBoc19ub2RlX2Fubm90YXRpb25fMjAxMi5jc3YiKQpnZW5lcwpgYGAKCm91dHB1dCBCRUQgZmlsZSBmb3IgdXNlIGJ5IHNhbXRvb2xzCgpgYGB7cn0KZ2VuZXMgJT4lCiAgc2VsZWN0KGNocm9tLCBzdGFydCwgZW5kLCBuYW1lKSAlPiUKICBtdXRhdGUoc3RhcnQgPSBzdGFydCAtIDEpICU+JSAjIGZpcnN0IGJhc2UgaXMgYmFzZSAwIGluIEJFRCBzeXN0ZW0uCiAgIyBidXQgZW5kIGJhc2UgaXMgbm90IGluY2x1ZGVkLCBzbyBsZWF2ZSB0aGF0IGFsb25lCiAgd3JpdGVfdHN2KCIuLi9vdXRwdXQvTVJfZ2VuZXMuYmVkIiwgY29sX25hbWVzID0gRkFMU0UpCmBgYAoKCk9uIHdoaXRuZXkuLi4gIE9ubHkgdXNlIHRoZSBVTiBmaWxlcyB0byBrZWVwIHRoaW5ncyBmYXN0ZXIKYGBge2Jhc2gsIGV2YWw9RkFMU0V9CmZvciBmIGluICQobHMgYmFtc25ldy9tYXBwaW5nX3YxLjVfbWVyZ2VkX2JhbV9maWxlcy8qVU4qYmFtKQpkbwogIGY9JChiYXNlbmFtZSAkZikKICBlY2hvICRmCiAgc2FtdG9vbHMgdmlldyAtYiBiYW1zbmV3L21hcHBpbmdfdjEuNV9tZXJnZWRfYmFtX2ZpbGVzLyRmIC1MIE1SX2dlbmVzLmJlZCA+IHN1YnNldGJhbXMvJGYKICBzYW10b29scyBpbmRleCBzdWJzZXRiYW1zLyRmCmRvbmUKYGBgCgpub3cgcnVuIGZyZWViYXllcyBpbiBwYXJhbGxlbC4gIEZpcnN0IHNwbGl0IHVwIHRoZSBiZWQgZmlsZSBpbnRvIGNodW5rcwoKYGBge2Jhc2gsIGV2YWw9RkFMU0V9CnNwbGl0IE1SX2dlbmVzLmJlZCAtbiAibC8xMiIgTVJfZ2VuZS5iZWR4CmBgYAoKCmBgYHtiYXNoLCBldmFsPUZBTFNFfQpwYXJhbGxlbCAtaiAxMiBmcmVlYmF5ZXMgLS10YXJnZXRzIHt9IC1mIH4vU2VxdWVuY2VzL3JlZl9nZW5vbWVzL0JfcmFwYS9nZW5vbWUvVjEuNS9CcmFwYV9zZXF1ZW5jZV92MS41LmZhIHN1YnNldGJhbXMvKi5iYW0gIDo6OiAqLmJlZHgqID4gTVJnZW5lcy52Y2YKYGBgCgpzbnBlZmYKYGBge2Jhc2gsIGV2YWw9RkFMU0V9CmphdmEgLWphciBzbnBFZmYvc25wRWZmLmphciAgQnJhc3NpY2FfcmFwYSBNUmdlbmVzLnZjZiA+IE1SZ2VuZXNfYW5uLnZjZgpgYGAKCm5vdyBicmluZyB0byBSIGFuZCBwcm9jZXNzCgpgYGB7Un0KdmNmLmhlYWRlciA8LSBzeXN0ZW0oImdyZXAgJyNDJyAuLi9vdXRwdXQvTVJHZW5lc19hbm4udmNmIixpbnRlcm4gPSBUUlVFKSAgJT4lIAogIHN0cl9yZXBsYWNlKCIjIiwiIikgICU+JSBzdHJfc3BsaXQocGF0dGVybiA9ICJcdCIpICU+JSAKICBtYWdyaXR0cjo6ZXh0cmFjdDIoMSkKdmNmLmhlYWRlcgpgYGAKCmBgYHtyfQp2Y2YuZGF0YSA8LSByZWFkX3RzdigiLi4vb3V0cHV0L01SR2VuZXNfYW5uLnZjZiIsbmEgPSBjKCIiLCJOQSIsIi4iKSxjb21tZW50PSIjIixjb2xfbmFtZXMgPSB2Y2YuaGVhZGVyKSAlPiUKICBzZWxlY3QoLUZJTFRFUikgJT4lCiAgbXV0YXRlKElEPXN0cl9jKENIUk9NLCJfIixQT1MpKQp2Y2YuZGF0YQpgYGAKCnNwbGl0IHRoZSBnZW5vdHlwZSByZWNvcmQ6CmBgYHtyfQp2Y2YuZGF0YSA8LSB2Y2YuZGF0YSAlPiUKICBzZXBhcmF0ZSh1bmtub3duLCBpbnRvPWMoIkdUIiwgIkRQIiwgIkFEIiwgIlJPIiwgIlFSIiwgIkFPIiwgIlFBIiwgIkdMIiksIGNvbnZlcnQgPSBUUlVFLCBzZXA9IjoiKQp2Y2YuZGF0YQpgYGAKCkZpbHRlci4gIEJlY2F1c2UgYWxsIG9mIHRoZSBSSUxzIGFyZSBjb25zaWRlcmVkIHRvZ2V0aGVyLCB3ZSB3YW50IHRvIGtlZXAgU05QcyB0aGF0IGFyZSBoZXRlcm96eWdvdXMgKG1lYW5pbmcgc2VncmVnYXRpbmcpCmBgYHtyfQp2Y2YuZGF0YSA8LSB2Y2YuZGF0YSAlPiUgZmlsdGVyKCEoR1QgJWluJSBjKCIwLzAiLCAiMS8xIikpKQp2Y2YuZGF0YQpgYGAKCmZpbHRlciBieSBkZXB0aC4gIFdlIGhhZCBhIGxhcmdlIG51bWJlciBvZiBzYW1wbGVzIGdvIGludG8gdGhpcyAobGlrZSA0MDApLiAgRmlsdGVyIHRvIHJlcXVpcmUgYSBkZXB0aCBvZiBhdCBsZWFzdCA0MDAKCmBgYHtyfQp2Y2YuZGF0YSA8LSB2Y2YuZGF0YSAlPiUgZmlsdGVyKERQID4gNDAwKQp2Y2YuZGF0YQpgYGAKCnNjYW5uaW5nIHRocm91Z2ggdGhpcywgcXVhbGl0aWVzIGFuZCBkZXB0aCBhbGwgbG9vayBnb29kIHNvIHdpbGwgbm90IGRvIGZ1cnRoZXIgZmlsdGVyaW5nLgoKbm93IHB1bGwgb3V0IHRoZSBzbnAgaW5mby4uLgoKZmlyc3QsIHB1bGwgb3V0IHRoZSBmdWxsIGFubm90YXRpb24gaW5vCmBgYHtyfQp2Y2YuZGF0YSA8LSB2Y2YuZGF0YSAlPiUKICBtdXRhdGUoQU5OPXN0cl9leHRyYWN0KElORk8sIkFOTi4qJCIpLAogICAgICAgICBBTk49c3RyX3JlbW92ZShBTk4sICJBTk49IikpCmhlYWQodmNmLmRhdGEkQU5OKQpgYGAKCnRoZSB0cmlja3kgcGFydCBpcyB0aGF0IGVhY2ggU05QIG1heSBoYXZlIGEgZGlmZmVyZW50IG51bWJlciBvZiBhbm5vdGF0aW9ucywgc28gSSBuZWVkIHRvIGNyZWF0ZSBhIGxpc3QtY29sdW1uCgpgYGB7cn0KYW5uLmhlYWRlciA8LSBzdHJfc3BsaXQoIkFsbGVsZSB8IEFubm90YXRpb24gfCBBbm5vdGF0aW9uX0ltcGFjdCB8IEdlbmVfTmFtZSB8IEdlbmVfSUQgfCBGZWF0dXJlX1R5cGUgfCBGZWF0dXJlX0lEIHwgVHJhbnNjcmlwdF9CaW9UeXBlIHwgUmFuayB8IEhHVlMuYyB8IEhHVlMucCB8IGNETkEucG9zIC8gY0ROQS5sZW5ndGggfCBDRFMucG9zIC8gQ0RTLmxlbmd0aCB8IEFBLnBvcyAvIEFBLmxlbmd0aCB8IERpc3RhbmNlIHwgRVJST1JTIC8gV0FSTklOR1MgLyBJTkZPIiwgcGF0dGVybj1maXhlZCgifCIpKSAlPiUgbWFncml0dHI6OmV4dHJhY3QyKDEpICU+JSBzdHJfcmVtb3ZlX2FsbCgiICIpCgp2Y2YuZGF0YSA8LSB2Y2YuZGF0YSAlPiUKICBtdXRhdGUoQU5OID0gc3RyX3NwbGl0KEFOTiwgcGF0dGVybj0iLCIpKSAlPiUKICB1bm5lc3QoKSAlPiUKICBtdXRhdGUoQU5OID0gc3RyX3JlbW92ZShBTk4sIHBhdHRlcm4gPSAiOy4qJCIpKSAlPiUKICBzZXBhcmF0ZShBTk4sIGludG89YW5uLmhlYWRlciwgc2VwPSJcXHwiKQoKdmNmLmRhdGEKYGBgCgpgYGB7cn0KdmNmLmRhdGEuZmlsdGVyIDwtIHZjZi5kYXRhICU+JSAKICBmaWx0ZXIoQW5ub3RhdGlvbl9JbXBhY3QgIT0gIkxPVyIsCiAgICAgICAgIEFubm90YXRpb25fSW1wYWN0ICE9ICJNT0RJRklFUiIpICU+JQogIGFycmFuZ2UoQ0hST00sIFBPUykKdmNmLmRhdGEuZmlsdGVyCmBgYAoKYGBge3J9CnZjZi5kYXRhLmZpbHRlciA8LSB2Y2YuZGF0YS5maWx0ZXIgJT4lIAogIG11dGF0ZShJR1Y9c3RyX2MoQ0hST00sICI6IiwgUE9TLCAiLSIsIFBPUykpICU+JQogIHNlbGVjdChDSFJPTSwgUE9TLCBJRCwgSUdWLCBldmVyeXRoaW5nKCkpCnZjZi5kYXRhLmZpbHRlcgpgYGAKCmBgYHtyfQp3cml0ZV9jc3YodmNmLmRhdGEuZmlsdGVyLCAiLi4vb3V0cHV0L01SX2dlbmVfQW5ub3RhdGVkX0ZpbHRlcmVkX1NOUHMuY3N2IikKYGBgCgo=